
# V Final Exhibit - Figure 4 Intensity: Retrospective Monthly Spending by Age and Service Category, Decedent ----

#### Adding cost by main category and month from death date ##################-
# first dcast the data from dt_cost_for_exhibit by cost category and month
#     from death date, and add it to dt_for_exhibits.
# Then check that total cost after index date is equal to UTL_f365d_total_cost.
# At last, a long version of it is produce for easy working in ggplot, where 
#     every observation has 5*12 rows (one per each category and month), 
#     with zero where there isn't any cost for this obs in this category in 
#     this month

fig_Cancer_Decedent_Spending_By_Time_a<- function( data = dt_for_exhibits_cancer,
                        data_cost = dt_cost_for_exhibits_cancer,
                        intensity_wards,
                        hosp_a = "Low",
                        hosp_b = "High",
                        hosp_a_name = "Inpatient -\nLow Intensity", 
                        hosp_b_name = "Inpatient -\nHigh Intensity") {
  
dt_catCost_by_monthFromIndex_cnr <- merge(
  x = data[
    DMG_died_within_365d == "1",  
    .(id_var, S_index_date_XX, DMG_age, UTL_f365d_total_cost)
    ],
  y = data.table::dcast.data.table(
    data = data_cost[ DMG_died_within_365d == "1" & # Decedent only
                                                            cost_date >= S_index_date_XX & # cost only after index date
                                                            !(is.na(cost_date) & is.na(actual_cost))
                                                          ][
                                                            ,
                                                            .(cost_by_cat = sum(actual_cost, na.rm = T)),
                                                            by = .(id_var, S_index_date_XX, new_cat, DMG_date_of_death_XX,
                                                                   month_from_indexDate = floor(
                                                                     as.numeric(difftime(cost_date,
                                                                                         S_index_date_XX,
                                                                                         units = "days")) / 30
                                                                     
                                                                   )
                                                            )
                                                              , month_from_indexDate := month_from_indexDate + 1
                                                              ][ month_from_indexDate == 13, month_from_indexDate := 12
                                                                ] ,
    formula = id_var + S_index_date_XX + DMG_date_of_death_XX 
    ~  new_cat + month_from_indexDate,
    value.var = "cost_by_cat",
    # in 12 month, there are duplicates as month 13 where unified with 12, 
    # thus "sum" is used
    fun.aggregate = sum 
  ),
  by = c("id_var", "S_index_date_XX"),
  all = T
)
cost_varitables <-setdiff( names(dt_catCost_by_monthFromIndex_cnr),
                           names(dt_catCost_by_monthFromIndex_cnr)[1:5])

for (j in cost_varitables) {
  dt_catCost_by_monthFromIndex_cnr[is.na(get(j)), (j) := 0]
}


# check that sum up all category equal to total cost. Since all costs more than
# one month after death were dropped, there will be some deviation but it shouldn't
# be in more than 5% of obs
dt_catCost_by_monthFromIndex_cnr[
  ,  
  tot_cost_diff := rowSums(.SD, na.rm = TRUE) - UTL_f365d_total_cost, 
  .SDcols = cost_varitables
  ]
if ((nrow(dt_catCost_by_monthFromIndex_cnr[tot_cost_diff <= -1]) + 
     nrow(dt_catCost_by_monthFromIndex_cnr[tot_cost_diff >   1])) /
    nrow(dt_catCost_by_monthFromIndex_cnr) > 0.05) {
  stop("Problem in making of dt_catCost_by_monthFromIndex_cnr")
}
dt_catCost_by_monthFromIndex_cnr[, tot_cost_diff := NULL]

cat_unique <- unique(data_cost[!is.na(new_cat)]$new_cat)

# generating long table for ggplot
dt_catCost_by_monthFromIndex_cnr_long <- NULL
for (cat in  cat_unique ) {
  dt_catCost_by_monthFromIndex_cnr_long <- rbind(
    dt_catCost_by_monthFromIndex_cnr_long,
    data.table::melt.data.table(
      data = dt_catCost_by_monthFromIndex_cnr,
      id.vars = c("id_var", "S_index_date_XX", "DMG_age",
                  "DMG_date_of_death_XX"),
      measure.vars = patterns(cat),
      variable.name = "category_and_month", 
      value.name = "cost"
    )
  )
}
dt_catCost_by_monthFromIndex_cnr_long[
  , `:=` (
    month_relative_to_indexDate = as.numeric(gsub("[^0-9,-]", "",  category_and_month)),
    category_temp = gsub("[0-9]", "",  category_and_month) 
  )
  ][
    , `:=` (
      category = factor(substr(category_temp, 1, nchar(category_temp) - 1),
                        levels = c(hosp_a, hosp_b,
                                   "Outpatient", "Drugs", "Imaging", "Others")),
      category_temp = NULL,
      category_and_month = NULL
    )
    ]

# check that final number of obs is correct:
if ((nrow(unique(dt_catCost_by_monthFromIndex_cnr_long, 
                 by = c("id_var", "S_index_date_XX"))) !=
     nrow(data[DMG_died_within_365d == "1"])) |
    (nrow(dt_catCost_by_monthFromIndex_cnr_long)/(6*12) != 
     nrow(data[DMG_died_within_365d == "1"]))){
  stop("Problem with generating long dt (before death cost)")
}

# add new variable - number of months between index date and death
dt_catCost_by_monthFromIndex_cnr_long[
  , 
  months_from_dxDate2deathDate := 
    as.numeric(difftime(DMG_date_of_death_XX, 
                        S_index_date_XX, 
                        units = "days")) / 30 
  ][ # once again, the following correction is needed as {365 %% 30 = 5}
    months_from_dxDate2deathDate > 12,
    months_from_dxDate2deathDate := 12
    ]

dt_for_exhibit6_index <- rbind(
  # by cat table
  dt_catCost_by_monthFromIndex_cnr_long[
    # keep only full month cost
    floor(months_from_dxDate2deathDate) >= month_relative_to_indexDate
    ][
      , .(sum_cost = sum(cost)),
      by = .(id_var, S_index_date_XX, month_relative_to_indexDate,
             category_short = factor(ifelse(category == hosp_a,
                                            hosp_a_name,
                                            ifelse(category==hosp_b,
                                                   hosp_b_name,
                                                   "All Other Services"))),
             months_from_dxDate2deathDate)
      ][
        ,
        .(ave_tot_cost = mean(sum_cost),
          num = .N),
        by = .(category_short, month_relative_to_indexDate, 
               full_months = floor(months_from_dxDate2deathDate))
        ][num >= min_obs_num],
  # total cost
  dt_catCost_by_monthFromIndex_cnr_long[
    # keep only full month cost
    floor(months_from_dxDate2deathDate) >= month_relative_to_indexDate
    ][
      , .(sum_cost = sum(cost)),
      by = .(id_var, S_index_date_XX, month_relative_to_indexDate,
             months_from_dxDate2deathDate)
      ][
        ,
        .(ave_tot_cost = mean(sum_cost),
          num = .N),
        by = .(month_relative_to_indexDate, 
               full_months = floor(months_from_dxDate2deathDate))
        ][num >= min_obs_num][, category_short := "All Services"]
)[, category_short := factor(category_short, 
                             levels = c("All Services", 
                                        hosp_a_name,
                                       hosp_b_name,
                                        "All Other Services"))]

# qa:
temp_qa <- dt_for_exhibit6_index[, .(sum(ave_tot_cost)), by = .(category_short)]
if ((temp_qa[category_short == "All Services"]$V1 / sum(temp_qa[category_short != "All Services"]$V1)) < 0.99) {
  stop("Problem in dt_for_exhibit6")
}
rm(temp_qa)

# drop 12 month:
dt_for_exhibit6_index <- dt_for_exhibit6_index[full_months!=12]

write.csv(dt_for_exhibit6_index,
          file = paste0("Cancer_Decedent_Spending_By_Time_a",hosp_a,"_",hosp_b,".csv"))


pdf(file = paste0("Cancer_Decedent_Spending_By_Time_a",hosp_a,"_",hosp_b,".pdf"))
print(ggplot(
  data = dt_for_exhibit6_index[category_short!="All Services"],
  aes(x = month_relative_to_indexDate, 
      y = ave_tot_cost, 
      group = full_months,
      color = full_months)
)+
  geom_line(size=1) +
  facet_grid( ~ category_short) +
  scale_x_continuous(breaks = seq(1, 12, 1), labels = seq(1, 12, 1)) + 
  scale_y_continuous(labels = scales::comma, breaks = seq(0,9000,3000), limits = c(0,10000)) +
  expand_limits(y = 0) + 
  scale_color_continuous(low = "midnightblue", high = "lightblue1",
                         name = "Full Months Survived",
                         trans="reverse",
                         breaks=c(1, 3, 5, 7, 9, 11),
                         guide=guide_colourbar(reverse = TRUE)) +
  labs(x = "Month From Initial Diagnosis",
       y = "Average Monthly Spending (NIS)")+
  theme(axis.text.x = element_text(size=base_size*0.7),
        text = element_text(size = 24))
)
dev.off()


}












# second version  - month from death  --------------------

#### Adding cost by main category and month from death date ##################-
# first dcast the data from dt_cost_for_exhibit by cost category and month
#     from death date, and add it to dt_for_exhibits.
# Then check that total cost after index date is equal to UTL_f365d_total_cost.
# At last, a long version of it is produce for easy working in ggplot, where 
#     every observation has 5*12 rows (one per each category and month), 
#     with zero where there isn't any cost for this obs in this category in 
#     this month
fig_Cancer_Decedent_Spending_By_Time_b <- function( data = dt_for_exhibits_cancer,
                        data_cost = dt_cost_for_exhibits_cancer,
                        intensity_wards,
                        hosp_a = "Low",
                        hosp_b = "High",
                        hosp_a_name = "Inpatient -\nLow Intensity", 
                        hosp_b_name = "Inpatient -\nHigh Intensity") {
  
dt_catCost_by_monthFromDeath_cnr <- merge(
  x = data[ DMG_died_within_365d == "1",  
    .(id_var, S_index_date_XX, DMG_age, UTL_f365d_total_cost)
    ],
  y = data.table::dcast.data.table(
    data = data_cost[DMG_died_within_365d == "1" & # Decedent only
                                                            cost_date >= S_index_date_XX & # cost only after index date
                                                            !(is.na(cost_date) & is.na(actual_cost))
                                                          ][,
                                                            .(cost_by_cat = sum(actual_cost, na.rm = T)),
                                                            by = .(id_var, S_index_date_XX, DMG_date_of_death_XX,new_cat,
                                                                   month_from_DeathDate = floor(
                                                                     as.numeric(difftime(DMG_date_of_death_XX,
                                                                                         cost_date, 
                                                                                         units = "days")) / 30
                                                                   )
                                                            )
                                                            ][ # drop cost observations that are dated more than 1 month after death
                                                              # (Only very small portion of total cost. Most probably typos)
                                                              month_from_DeathDate >= -1
                                                              ][ # if cost occures within one month after death date, it being treated
                                                                # as if it occured just before the death and due to some beaurocracy
                                                                # or worng paper work was registed as post mortem cost
                                                                month_from_DeathDate == -1, month_from_DeathDate := 0
                                                                ][ # add 1 to months after dx date (so the range will be 1-12 rather than 0-11)
                                                                  , month_from_DeathDate := month_from_DeathDate + 1
                                                                  ][ # since {365 %% 30 = 5}, couple of days fall with 13 month before death
                                                                    #         # despite the fact that they died within one year from dx date. Thus 
                                                                    # 13 month is now turn into 12
                                                                    month_from_DeathDate == 13, month_from_DeathDate := 12
                                                                    ] ,
    formula = id_var + S_index_date_XX + DMG_date_of_death_XX 
    ~  new_cat + month_from_DeathDate,
    value.var = "cost_by_cat",
    # in 12 month, there are duplicates as month 13 where unified with 12, 
    # thus "sum" is used
    fun.aggregate = sum 
  ),
  by = c("id_var", "S_index_date_XX"),
  all = T
)

cost_varitables <-setdiff( names(dt_catCost_by_monthFromDeath_cnr),
                           names(dt_catCost_by_monthFromDeath_cnr)[1:5])

for (j in cost_varitables) {
  dt_catCost_by_monthFromDeath_cnr[is.na(get(j)), (j) := 0]
}


# check that sum up all category equal to total cost. Since all costs more than
# one month after death were dropped, there will be some deviation but it shouldn't
# be in more than 5% of obs
dt_catCost_by_monthFromDeath_cnr[
  ,  
  tot_cost_diff := rowSums(.SD, na.rm = TRUE) - UTL_f365d_total_cost, 
  .SDcols = cost_varitables
  ]
if ((nrow(dt_catCost_by_monthFromDeath_cnr[tot_cost_diff <= -1]) + 
     nrow(dt_catCost_by_monthFromDeath_cnr[tot_cost_diff >   1])) /
    nrow(dt_catCost_by_monthFromDeath_cnr) > 0.05) {
  stop("Problem in making of dt_catCost_by_monthFromDeath_cnr")
}
dt_catCost_by_monthFromDeath_cnr[, tot_cost_diff := NULL]

cat_unique <- unique(data_cost[!is.na(new_cat)]$new_cat)

# generating long table for ggplot
dt_catCost_by_monthFromDeath_cnr_long <- NULL
for (cat in cat_unique) {
  dt_catCost_by_monthFromDeath_cnr_long <- rbind(
    dt_catCost_by_monthFromDeath_cnr_long,
    data.table::melt.data.table(
      data = dt_catCost_by_monthFromDeath_cnr,
      id.vars = c("id_var", "S_index_date_XX", "DMG_age",
                  "DMG_date_of_death_XX"),
      measure.vars = patterns(cat),
      variable.name = "category_and_month", 
      value.name = "cost"
    )
  )
}
dt_catCost_by_monthFromDeath_cnr_long[
  , `:=` (
    month_relative_to_DeathDate = as.numeric(gsub("[^0-9,-]", "",  category_and_month)),
    category_temp = gsub("[0-9]", "",  category_and_month) 
  )
  ][
    , `:=` (
      category = factor(substr(category_temp, 1, nchar(category_temp) - 1),
                        levels = c(hosp_a, hosp_b,
                                   "Outpatient", "Drugs", "Imaging", "Others")),
      category_temp = NULL,
      category_and_month = NULL
    )
    ]

# check that final number of obs is correct:
if ((nrow(unique(dt_catCost_by_monthFromDeath_cnr_long, 
                 by = c("id_var", "S_index_date_XX"))) !=
     nrow(data[DMG_died_within_365d == "1"])) |
    (nrow(dt_catCost_by_monthFromDeath_cnr_long)/(6*12) != 
     nrow(data[DMG_died_within_365d == "1"]))){
  stop("Problem with generating long dt (before death cost)")
}

# add new variable - number of months between index date and death
dt_catCost_by_monthFromDeath_cnr_long[
  , 
  months_from_dxDate2deathDate := 
    as.numeric(difftime(DMG_date_of_death_XX, 
                        S_index_date_XX, 
                        units = "days")) / 30 
  ][ # once again, the following correction is needed as {365 %% 30 = 5}
    months_from_dxDate2deathDate > 12,
    months_from_dxDate2deathDate := 12
    ]

dt_for_exhibit6_Death <- rbind(
  # by cat table
  dt_catCost_by_monthFromDeath_cnr_long[
    # keep only full month cost
    floor(months_from_dxDate2deathDate) >= month_relative_to_DeathDate
    ][, .(sum_cost = sum(cost)),
      by = .(id_var, S_index_date_XX, month_relative_to_DeathDate,
             category_short = factor(ifelse(category == hosp_a,
                                            hosp_a_name,
                                            ifelse(category == hosp_b,
                                            hosp_b_name,
                                            "All Other Services"))),
             months_from_dxDate2deathDate)][,
        .(ave_tot_cost = mean(sum_cost),
          num = .N),
        by = .(category_short, month_relative_to_DeathDate, 
               full_months = floor(months_from_dxDate2deathDate))][num >= min_obs_num],
  # total cost
  dt_catCost_by_monthFromDeath_cnr_long[
    # keep only full month cost
    floor(months_from_dxDate2deathDate) >= month_relative_to_DeathDate
    ][
      , .(sum_cost = sum(cost)),
      by = .(id_var, S_index_date_XX, month_relative_to_DeathDate,
             months_from_dxDate2deathDate)
      ][
        ,
        .(ave_tot_cost = mean(sum_cost),
          num = .N),
        by = .(month_relative_to_DeathDate, 
               full_months = floor(months_from_dxDate2deathDate))
        ][num >= min_obs_num][, category_short := "All Services"])[,
                            category_short := factor(category_short, 
                             levels = c("All Services", 
                                        hosp_a_name,
                                        hosp_b_name,
                                        "All Other Services"))]

# qa:
temp_qa <- dt_for_exhibit6_Death[, .(sum(ave_tot_cost)), by = .(category_short)]
if ((temp_qa[category_short == "All Services"]$V1 / sum(temp_qa[category_short != "All Services"]$V1)) < 0.99) {
  stop("Problem in dt_for_exhibit6")
}
rm(temp_qa)

# drop 12 month:
dt_for_exhibit6_Death <- dt_for_exhibit6_Death[full_months!=12]


# second version  from death 

write.csv(dt_for_exhibit6_Death,
          file = paste0("Cancer_Decedent_Spending_By_Time_b",hosp_a,"_",hosp_b,".csv"))


pdf(file = paste0("Cancer_Decedent_Spending_By_Time_b",hosp_a,"_",hosp_b,".pdf"))
print(ggplot(
  data = dt_for_exhibit6_Death[category_short!="All Services"],
  aes(x = month_relative_to_DeathDate, 
      y = ave_tot_cost, 
      group = full_months,
      color = full_months))+
  geom_line(size=1) +
  facet_grid( ~ category_short) +
  scale_x_reverse(breaks = rev(seq(1, 12, 1)), labels = rev(seq(1, 12, 1)) ) +
  # scale_x_continuous(breaks = rev(seq(1, 12, 1)), labels = rev(seq(1, 12, 1)) ) + 
  scale_y_continuous(labels = scales::comma, breaks = seq(0,9000,3000), limits = c(0,10000)) +
  expand_limits(y = 0) + 
  scale_color_continuous(low = "midnightblue", high = "lightblue1",
                         name = "Full Months Survived",
                         trans="reverse",
                         breaks=c(1, 3, 5, 7, 9, 11),
                         guide=guide_colourbar(reverse = TRUE)) +
  labs(x = "Month From Death Date",
       y = "Average Monthly Spending (NIS)")+
  theme(axis.text.x = element_text(size=base_size*0.7),
        text = element_text(size = 24))
)
dev.off()

}



